packages <- c("CIMseq", "CIMseq.data", "tidyverse", "scSeqTools", "printr")
purrr::walk(packages, library, character.only = TRUE)
rm(packages)

##DATA
load('../data/CIMseqData.rda')
load('../data/sObj.rda')
#rename classes
renameClasses <- function(class) {
  case_when(
    class == "0" ~ "C.Stem.proximal",
    class == "1" ~ "C.Stem.distal",
    class == "2" ~ "SI.Stem",
    class == "3" ~ "C.Goblet.distal",
    class == "4" ~ "SI.Enterocytes",
    class == "5" ~ "SI.Goblet",
    class == "6" ~ "C.Goblet.proximal",
    class == "7" ~ "Enteroendocrine",
    class == "8" ~ "Tufft",
    class == "9" ~ "SI.Paneth",
    class == "10" ~ "Blood",
    TRUE ~ "error"
  )
}

cOrder <- c(
  "C.Stem.proximal", "C.Goblet.proximal", "C.Stem.distal", "C.Goblet.distal",
  "SI.Goblet", "SI.Paneth", "SI.Stem", "SI.Enterocytes",
  "Enteroendocrine", "Tufft", "Blood"
)

getData(cObjSng, "classification") <- renameClasses(getData(cObjSng, "classification"))
features <- getData(cObjMul, "features")
names(features) <- renameClasses(names(features))
getData(cObjMul, "features") <- features
fractions <- getData(sObj, "fractions")
colnames(fractions) <- renameClasses(colnames(fractions))
sObj@fractions <- fractions

Classification

plotUnsupervisedClass(cObjSng, cObjMul)

plotUnsupervisedMarkers(
  cObjSng, cObjMul,
  c("Lgr5", "Ptprc", "Chga", "Dclk1", "Alpi", "Slc26a3", "Atoh1", "Lyz1"),
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

plotUnsupervisedMarkers(
  cObjSng, cObjMul, "Hoxb13",
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

plotUnsupervisedMarkers(
  cObjSng, cObjMul, "Mki67",
  pal = RColorBrewer::brewer.pal(8, "Set1")
)

classHeatmap(
  data = data.frame(
    gene = rownames(getData(cObjMul, "counts"))[getData(cObjMul, "features")],
    class = names(getData(cObjMul, "features"))
  ),
  counts.log = getData(cObjSng, "counts.log"),
  classes = tibble(
    sample = colnames(getData(cObjSng, "counts")), 
    class = getData(cObjSng, "classification")
  )
)

Show top 10 (by fold change) features per class.

violinPlot <- function(cpm, genes, classes, class) {
  cpm[genes, ] %>%
    t() %>% 
    matrix_to_tibble("sample") %>%
    gather(gene, cpm, -sample) %>%
    full_join(tibble(sample = colnames(cpm), class = classes), by = "sample") %>%
    ggplot() +
    geom_jitter(aes(class, cpm), height = 0, size = 0.1, alpha = 0.25) +
    facet_wrap(~gene, scales = "free_y") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90)) +
    labs(title = paste0(class, " genes"))
}


getGenes <- function(
  cpm, classes, features, class, ntop = 10
) {
  fc <- foldChangePerClass(
    cpm[features[names(features) == class], ], 
    tibble(sample = colnames(cpm), class = classes)
  )
  rownames(fc[order(fc[, class], decreasing = TRUE), ])[1:ntop]
}

cpm <- getData(cObjSng, "counts.cpm")
c <- "C.Stem.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Stem.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Goblet.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "C.Goblet.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Stem"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Goblet"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Enterocytes"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

c <- "SI.Paneth"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)

Deconvolution

plotSwarmCircos(sObj, cObjSng, cObjMul, weightCut = 0, classOrder = cOrder)

Filtering

Only detected duplicates and triplicates.
Only ERCC estimated cell number <= 4.
Weight cutoff = 5.

adj <- adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE)
samples <- rownames(adj)
rs <- rowSums(adj)
keep <- rs == 2 | rs == 3

plotSwarmCircos(
  filterSwarm(sObj, keep), cObjSng, cObjMul, weightCut = 5, 
  classOrder = cOrder, theoretical.max = 4
)

False positive stats

efm <- getEdgesForMultiplet(sObj, cObjSng, cObjMul, theoretical.max = 4) %>% 
  mutate(conn = map2(from, to, ~tibble(
    from = sort(c(.x, .y))[1], to = sort(c(.x, .y))[2]
  ))) %>% 
  select(-from, -to) %>% 
  unnest() %>% 
  distinct()

# fp <- efm %>%
#   mutate(fp = case_when(
#     str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
#     str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
#     str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
#     str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
#     TRUE ~ FALSE
#   )) %>%
#   group_by(sample) %>%
#   summarize(fp = sum(fp)) %>%
#   {setNames(pull(., fp), pull(., sample))}

fp <- efm %>%
  inner_join(select(MGA.Meta, sample, sub_tissue)) %>%
  mutate(fp = case_when(
    (str_detect(from, "^C") | str_detect(to, "^C")) & sub_tissue == "small_intestine" ~ TRUE,
    (str_detect(from, "^SI") | str_detect(to, "^SI")) & sub_tissue == "colon" ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  group_by(sample) %>%
  summarize(fp = sum(fp)) %>%
  {setNames(pull(., fp), pull(., sample))}
## Joining, by = "sample"

Detected 508 false positive connections out of 1383 total connections. Of those multiplets with a detected connection, 304 multiplets have at least one false positive out of 1179 total multiplets. The per multiplet false positive distribution is shown in the plot below.

ggplot() +
  geom_bar(aes(fp)) +
  theme_bw() +
  labs(x = "False positives per multiplet") +
  scale_x_continuous(breaks = 0:max(fp))

The type of false positive connections and their amount are shown below.

typeFreq <- efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  filter(fp) %>%
  count(from, to) %>%
  arrange(desc(n)) %>%
  as.data.frame()

typeFreq
from to n
C.Stem.distal C.Stem.proximal 312
C.Goblet.distal C.Stem.proximal 218
C.Stem.proximal SI.Stem 159
C.Goblet.distal C.Goblet.proximal 113
C.Goblet.proximal C.Stem.distal 86
C.Stem.proximal SI.Goblet 84
C.Stem.distal SI.Enterocytes 51
C.Stem.distal SI.Goblet 44
C.Goblet.distal SI.Goblet 39
C.Stem.distal SI.Stem 28
C.Stem.proximal SI.Enterocytes 27
C.Goblet.proximal SI.Goblet 24
C.Goblet.distal SI.Stem 12
C.Stem.proximal SI.Paneth 10
C.Goblet.distal SI.Enterocytes 4
C.Goblet.proximal SI.Stem 4
C.Goblet.proximal SI.Enterocytes 2
C.Stem.distal SI.Paneth 1
efm %>%
  inner_join(select(MGA.Meta, sample, sub_tissue)) %>%
  mutate(fp = case_when(
    (str_detect(from, "^C") | str_detect(to, "^C")) & sub_tissue == "small_intestine" ~ TRUE,
    (str_detect(from, "^SI") | str_detect(to, "^SI")) & sub_tissue == "colon" ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  filter(fp) %>%
  count(from, to) %>%
  arrange(desc(n)) %>%
  as.data.frame()
## Joining, by = "sample"
from to n
C.Stem.proximal SI.Stem 159
C.Stem.proximal SI.Goblet 84
C.Stem.distal SI.Enterocytes 51
C.Stem.distal SI.Goblet 44
C.Goblet.distal SI.Goblet 39
C.Stem.distal SI.Stem 28
C.Stem.proximal SI.Enterocytes 27
C.Goblet.proximal SI.Goblet 24
C.Goblet.distal SI.Stem 12
C.Stem.proximal SI.Paneth 10
C.Stem.proximal Tufft 5
C.Goblet.distal SI.Enterocytes 4
C.Goblet.proximal SI.Stem 4
C.Stem.proximal Enteroendocrine 4
C.Stem.distal Enteroendocrine 3
C.Goblet.proximal SI.Enterocytes 2
SI.Enterocytes SI.Goblet 2
Blood C.Stem.distal 1
Blood C.Stem.proximal 1
C.Stem.distal SI.Paneth 1
Enteroendocrine SI.Enterocytes 1
Enteroendocrine SI.Goblet 1
SI.Goblet SI.Stem 1
#number of features corresponding to class
#total sum of counts for features corresponding to class
# features <- getData(cObjMul, "features")
# cpm <- getData(cObjSng, "counts.cpm")
# classifications <- getData(cObjSng, "classification")
# rs <- rowSums(cpm)
# table(names(features))
# 
# geneDat <- tibble(class = names(features), gene = rownames(getData(cObjSng, "counts"))[features]) %>%
#   mutate(all = rs[gene]) %>%
#   mutate(classOnly = map2_dbl(class, gene, function(c, g) {
#     sum(cpm[g, classifications == c])
#   })) %>% 
#   group_by(class) %>%
#   summarize(sum.all = sum(all), sum.classOnly = sum(classOnly)) 
# 
# inner_join(typeFreq, geneDat, by = c("from" = "class")) %>%
#   inner_join(geneDat, by = c("to" = "class")) %>%
#   mutate(
#     sum.all = map2_dbl(sum.all.x, sum.all.y, ~sum(c(.x, .y))),
#     sum.classOnly = map2_dbl(sum.classOnly.x, sum.classOnly.y, ~sum(c(.x, .y)))
#   ) %>%
#   ggplot() +
# geom_point(aes(sum.all, n))
dc <- rowSums(adjustFractions(cObjSng, cObjMul, sObj))
tibble(
  sample = names(fp),
  `False positives (n)` = fp,
  `Deconvolution detected connections (n)` = dc[names(fp)]
) %>% 
  group_by(`Deconvolution detected connections (n)`) %>%
  summarize(`False positives (n)` = sum(`False positives (n)`)) %>%
  as.data.frame()
Deconvolution detected connections (n) False positives (n)
2 142
3 225
4 109
5 32
6 0

Correlation between various parameters and the number of false positives is shown below:

mulNames <- names(fp) #note other multiplets have 0 connections

ec <- estimateCells(cObjSng, cObjMul, warn = FALSE) %>%
  filter(sample %in% mulNames) %>%
  {setNames(pull(., estimatedCellNumber), pull(., sample))}
ec <- ec[mulNames]
dec <- rowSums(adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE))
dec <- dec[mulNames]
c <- getData(sObj, "costs")
c <- c[mulNames]
tc <- colSums(getData(cObjMul, "counts"))
tc <- tc[mulNames]
dg <- apply(getData(cObjMul, "counts"), 2, function(c) length(which(c != 0)))
dg <- dg[mulNames]
feat.do <- getData(cObjMul, "counts.cpm")[features, ] %>%
  matrix_to_tibble("gene") %>%
  gather(sample, cpm, -gene) %>%
  group_by(sample) %>%
  summarize(do = length(which(cpm == 0))) %>%
  {setNames(pull(., do), pull(., sample))}
feat.do <- feat.do[mulNames]

cormat <- cor(matrix(c(fp, ec, dec, c, tc, dg, feat.do), ncol = 7))
corvec <- cormat[2:nrow(cormat), 1]
names(corvec) <- c(
  "ERCC estimated cell number", "Deconvolution estimated cell number", 
  "Deconvolution cost", "Total counts", "Total detected genes", 
  "Drop-outs in features"
)
data.frame(cor = corvec)
cor
ERCC estimated cell number 0.3556341
Deconvolution estimated cell number 0.4766768
Deconvolution cost 0.2056108
Total counts 0.2361513
Total detected genes 0.2969849
Drop-outs in features -0.2511878
cpm <- getData(cObjMul, "counts.cpm")
features <- getData(cObjMul, "features")
cpm[features, ] %>%
  matrix_to_tibble("gene") %>%
  gather(sample, cpm, -gene) %>%
  inner_join(tibble(class = names(features), gene = rownames(cpm)[features])) %>%
  group_by(sample, class) %>%
  summarize(cpm.sum = sum(cpm)) %>%
  ungroup() %>%
  inner_join(tibble(sample = names(fp), fp = fp)) %>%
  ggplot() +
  geom_point(aes(cpm.sum, fp)) +
  facet_wrap(~class)
#multiplets with high fp look to have low expression of the selected features

Distribution of fractions in connections thought to be true vs connections known to be false.

fractions <- getData(sObj, "fractions")
efm %>%
  mutate(fp = case_when(
    str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
    str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
    str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
    str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
    TRUE ~ FALSE
  )) %>%
  mutate(fracs = pmap(list(sample, from, to), function(s, f, t) {
    fractions[s, c(f, t)]
  })) %>%
  unnest() %>%
  ggplot() +
  geom_boxplot(aes(fp, fracs)) +
  labs(x = "False positive", y = "Deconvolution fraction") +
  theme_bw()

Cluster multiplets and look for an overrepresentaion of false positives.

p.dist <- CIMseq::pearsonsDist(getData(cObjMul, "counts.cpm"), CIMseq::selectTopMax(getData(cObjMul, "counts.cpm"), 2000))
set.seed(234899)
u <- uwot::umap(p.dist)
rownames(u) <- colnames(getData(cObjMul, "counts"))
u %>% 
  matrix_to_tibble("sample") %>% 
  inner_join(tibble(sample = names(fp), fp = fp)) %>% 
  mutate(fp.bool = fp > 0) %>% 
  ggplot() + 
  geom_point(aes(V1, V2, colour = fp.bool)) +
  theme_bw() +
  labs(x = "UMAP1", y = "UMAP2") +
  guides(colour = guide_legend(title = "Has false positive?"))
## Joining, by = "sample"

u %>% 
  matrix_to_tibble("sample") %>% 
  inner_join(tibble(sample = names(fp), fp = fp)) %>% 
  ggplot() + 
  geom_point(aes(V1, V2, colour = fp)) +
  theme_bw() +
  scale_colour_viridis_c() +
  labs(x = "UMAP1", y = "UMAP2") +
  guides(colour = guide_legend(title = "False positives (n)"))
## Joining, by = "sample"